library(survival)
library(maxstat)
library(survminer)
library(tidyverse)
library(MultiAssayExperiment)
library(EnvStats)
library(readxl)
source("data/Figure_layouts.R")
load("data/CLL_Proteomics_Setup.RData")
load("data/CLL_Proteomics_ConsensusClustering.RData")
load("data/proteomics_model_noDrug_factors.RData")
colData(multiomics_MAE)$PG <- as.factor(CCP_group5[rownames(colData(multiomics_MAE))])
This functions extract information from cox model objects
Extract.Cox <- function(x)
{
x <- summary(x)
p.value<-signif(x$coefficients[1, 5], digits=2)
beta<-signif(x$coefficients[1, 1], digits=2);#coeficient beta
HR <-signif(x$coefficients[1, 2], digits=2);#exp(beta)
wald.test<-signif(x$waldtest["test"], digits=2)
HR.confint.lower <- signif(x$conf.int[1 ,"lower .95"], 2)
HR.confint.upper <- signif(x$conf.int[1 ,"upper .95"], 2)
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res<-c(beta, HR, wald.test, p.value)
names(res)<-c("beta", "HR (95% CI for HR)", "wald.test",
"p.value")
return(res)
}
This function calculate univeriate/multivariate cox models
Calc.Cox <- function(formula, covariates, dataframe, MultiVars=NULL) {
if(!is_empty(MultiVars)) {z <- paste(" +", MultiVars, collapse="")}
else{z <- NULL}
# Define formulas for univariate model
cox_formulas <- sapply(covariates,
function(y) as.formula(paste(formula, y, z)))
# Calculate univariate model for all formulas
cox_models <- lapply(cox_formulas,
function(z){coxph(z, data = dataframe)})
# Extract data from summary objects
cox_results <- lapply(cox_models, Extract.Cox)
# Transform data to data frame
results <- t(as.data.frame(cox_results, check.names = FALSE)) %>%
as.data.frame()
results$p.value <- as.character(results$p.value) %>% as.numeric()
results$p.value_adj <- p.adjust(results$p.value, method = "BH")
return(results)
}
Results <- list()
# Protein
prot <- multiomics_MAE@ExperimentList$proteomics
# Remove all proteins which harbor any NA
prot <- prot[rowSums(is.na(prot))==0, ]
# Remove '-' as it makes problems in formula objects
rownames(prot) <- gsub(rownames(prot), pattern = "-", replacement = "_")
df <- as.data.frame(colData(multiomics_MAE))
# Set 'time' either to observation time (w/o next treatment) or to actual time to next treatment
# Status: 1 for patients w/o next treatment ('censored'), 2 for patients who actually recieved treatment
df <- df %>%
dplyr::rename(TTNT = timeDiff_TTNT_orig) %>%
mutate(time = ifelse(is.na(TTNT), ObsTime, TTNT),
status = ifelse(is.na(TTNT), 1, 2))
# Remove without any follow up
df <- filter(df, ObsTime!=0) %>% filter(!treatment_status=="intreatment")
df_short <- filter(df, patient_ID %in%
colnames(multiomics_MAE@ExperimentList$proteomics))
df_short <- dplyr::select(df_short, patient_ID, IGHV, trisomy12, time, status)
# Merge protein data with TTNT within one data frame
prot_surv <- t(prot) %>% as.data.frame() %>%
rownames_to_column(var = "patient_ID") %>%
right_join(df_short, by="patient_ID")
print(nrow(prot_surv))
## [1] 61
# Apply Calc.Cox
res_prot <- Calc.Cox(formula = 'Surv(time, status)~',
covariates = rownames(prot),
dataframe = prot_surv)
# Top 100 proteins (lowest p value)
res_prot %>% rownames_to_column(var = "Symbol") %>%
top_n(100, -p.value) %>% arrange(p.value)
# P value histogram
hist(res_prot$p.value, breaks = 100)
# Save object in list
Results[["Univ.prot_TTNT"]] <- res_prot
message("Number proteins significant (FDR = 5%) associated with TTNT")
sum(res_prot$p.value_adj < 0.05)
## [1] 605
message("Percent proteins significant (FDR = 5%) associated with TTNT")
sum(res_prot$p.value_adj < 0.05) / length(res_prot$p.value_adj)
## [1] 0.0828
# Filter for patients with complete data
df_short <-
filter(df, patient_ID %in% rownames(proteomics_model_noDrug_factors)) %>%
dplyr::select(patient_ID, time, status, IGHV, trisomy12)
# Merge protein data with TTNT within one data frame
LF_surv <- proteomics_model_noDrug_factors %>% as.data.frame() %>%
rownames_to_column(var = "patient_ID") %>%
right_join(df_short, by="patient_ID")
print(nrow(LF_surv))
## [1] 72
# Apply Calc.Cox
res_LF <- Calc.Cox(formula = 'Surv(time, status)~',
covariates = colnames(proteomics_model_noDrug_factors),
dataframe = LF_surv)
# Top 100 genes (lowest p value)
res_LF %>%
rownames_to_column(var = "LF") %>%
arrange(p.value)
# Save object in list
Results[["Univ.LF_TTNT"]] <- res_LF
KM.Plot <- function(dataset, name, plot=TRUE) {
# Filter for patients with complete data
df_short <- df %>%
filter(patient_ID %in% rownames(dataset)) %>%
dplyr::select(patient_ID, time, status)
# Merge protein data with TTNT within one data frame
df_surv <- dataset %>%
as.data.frame() %>%
rownames_to_column(var = "patient_ID") %>%
right_join(df_short, by="patient_ID") %>%
dplyr::select(time, status, name) %>%
dplyr::rename(Factor = name) %>%
mutate(BinFactor = Factor)
formel <- as.formula(paste0("Surv(time, status) ~ Factor"))
# Test if dataset is discrete or continuous
x1 <- dataset %>% as.numeric() %>% unique() %>% na.omit() %>% length()
x2 <- dataset %>% as.numeric() %>% na.omit() %>% length()
# If dataset is continuous, find optimal Cut-Off
if(x1/x2 > 0.1) {
mxs.obj <- maxstat.test(Surv(time, status) ~ Factor, data=df_surv,
smethod="LogRank", pmethod="exactGauss",
minprop = 0.25, maxprop=0.75, abseps=0.01)
df_surv <- mutate(df_surv, BinFactor=ifelse(Factor > mxs.obj$estimate, "High", "Low"))
}
df_surv$time <- df_surv$time/365
fit <- survfit(Surv(time, status) ~ BinFactor, data = df_surv)
if(plot==TRUE)
{
surv_plot <- ggsurvplot(fit, data = df_surv,
pval = TRUE,
pval.size = 4,
pval.coord = c(1800, 0.95),
xlab = "Time in years",
palette = c("#E7B800", "#2E9FDF"),
legend.labs = gsub("BinFactor=", "", names(fit$strata) ),
ylab="Treatment free survival",
legend.title = name,
legend = "top",
ggtheme = theme_bw())
print(surv_plot)
return(surv_plot)
} else
{
l <- list(fit, df_surv)
names(l) <- c("fit", "df")
return(l)
}
}
df <- as.data.frame(colData(multiomics_MAE))
# Set 'time' either to observation time (w/o next treatment) or to actual time to next treatment
# Status: 1 for patients w/o next treatment ('censored'), 2 for patients who actually recieved treatment
df <- df %>%
dplyr::rename(TTNT = timeDiff_TTNT_orig) %>%
mutate(time = ifelse(is.na(TTNT), ObsTime, TTNT),
status = ifelse(is.na(TTNT), 1, 2))
# Remove without any follow up
df <- filter(df, ObsTime!=0) %>% filter(!treatment_status=="intreatment")
df_TTNT <- df
sig_prot_univ <- Results$Univ.prot_TTNT %>% as_tibble()
sig_prot_univ$hgnc_symbol <- rownames(Results$Univ.prot_TTNT)
df <- df_TTNT
PIK3CD_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "PIK3CD")
PLCG2_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "PLCG2")
CD20_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "MS4A1")
SAMHD1_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "SAMHD1")
FCRL2_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "FCRL2")
CD40_surv_plot <- KM.Plot(dataset = t(multiomics_MAE@ExperimentList$proteomics), name = "CD40")
HR_BCR_volcano <- sig_prot_univ %>%
separate(`HR (95% CI for HR)`, into=c("HR_prot", "95% CI for HR"), sep = " \\(") %>%
mutate(HR_prot=as.numeric(HR_prot), `95% CI for HR`=gsub("\\)", "", `95% CI for HR`) ) %>%
ggplot(aes(log2(HR_prot), -log10(p.value) )) +
geom_point(data = . %>% filter(p.value_adj >= 0.05), color="lightgray", alpha=0.5) +
geom_point(data = . %>% filter(p.value_adj < 0.05), color="#0571b0", alpha=0.5 ) +
geom_point(data = . %>% filter( hgnc_symbol %in% BCR_genes ), color="orange1" ) +
pp_sra
HR_BCR_volcano + ggtitle("Proteins ~ TTNT") + coord_cartesian(ylim=c(0.2,7.3)) +
ggrepel::geom_label_repel(data = . %>% filter(p.value_adj < 0.05, hgnc_symbol %in% BCR_genes),
aes(label= hgnc_symbol), size=3)
sig_prot_univ %>% transmute(hgnc_symbol, rank= base::rank(p.value, ties.method = "first") ) #%>% write.table("/Users/sophierabe/Desktop/PhD/Labor/Proteomics/CLL/GSEA_CLL_Proteomics/191125_TTNT_Protein_GSEAlist.rnk", quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
LF_HR_plot <-
Results[["Univ.LF_TTNT"]] %>%
separate(col = `HR (95% CI for HR)`, into = c("HR", "CI"), sep = "\\(" ) %>%
mutate(CI = gsub(")", "", CI )) %>%
separate(CI, into = c("lower_CI", "upper_CI"), sep = "-" ) %>%
mutate("Hazard ratio" = as.numeric(HR), lower_CI = as.numeric(lower_CI), upper_CI= as.numeric(upper_CI), LF= rownames(.)) %>%
mutate("Latent factor" = as.factor(gsub("LF", "", LF ) )) %>%
mutate("Latent factor" = factor(`Latent factor`, levels = c(1:nrow(Results[["Univ.LF_TTNT"]])) ) ) %>%
mutate(sig = if_else( (lower_CI < 1 & upper_CI <1) | (lower_CI > 1 & upper_CI >1), "significant", "NS" )) %>%
ggplot(aes( `Latent factor`, `Hazard ratio` )) +
geom_point(aes(color=sig)) +
geom_linerange(aes( ymin=lower_CI, ymax= upper_CI, color= sig )) +
scale_color_manual(values = c("black", "blue")) +
pp_sra +
coord_flip(ylim = c(0.22, 4.5)) +
geom_hline(yintercept = 1, linetype= "dashed") +
scale_y_log10()
LF_HR_plot +
geom_text(y= 0.6, aes(label= p.value, color=sig )) +
theme(legend.position = 'none')
save( HR_BCR_volcano, PIK3CD_surv_plot, PLCG2_surv_plot,
LF_HR_plot, CD20_surv_plot, SAMHD1_surv_plot, FCRL2_surv_plot, CD40_surv_plot,
file = "RData_plots/CLL_Proteomics_TTNT_Plots.RData")
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] DESeq2_1.28.1 readxl_1.3.1
## [3] EnvStats_2.3.1 MultiAssayExperiment_1.14.0
## [5] SummarizedExperiment_1.18.2 DelayedArray_0.14.1
## [7] matrixStats_0.56.0 Biobase_2.48.0
## [9] GenomicRanges_1.40.0 GenomeInfoDb_1.24.2
## [11] IRanges_2.22.2 S4Vectors_0.26.1
## [13] BiocGenerics_0.34.0 forcats_0.5.0
## [15] stringr_1.4.0 dplyr_1.0.2
## [17] purrr_0.3.4 readr_1.3.1
## [19] tidyr_1.1.2 tibble_3.0.3
## [21] tidyverse_1.3.0 survminer_0.4.8
## [23] ggpubr_0.4.0 ggplot2_3.3.2
## [25] maxstat_0.7-25 survival_3.2-3
## [27] knitr_1.29 BiocStyle_2.16.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.1
## [4] rio_0.5.16 XVector_0.28.0 base64enc_0.1-3
## [7] fs_1.5.0 rstudioapi_0.11 farver_2.0.3
## [10] ggrepel_0.8.2 bit64_4.0.5 AnnotationDbi_1.50.3
## [13] fansi_0.4.1 mvtnorm_1.1-1 lubridate_1.7.9
## [16] xml2_1.3.2 splines_4.0.2 geneplotter_1.66.0
## [19] jsonlite_1.7.1 broom_0.7.0 km.ci_0.5-2
## [22] annotate_1.66.0 dbplyr_1.4.4 BiocManager_1.30.10
## [25] compiler_4.0.2 httr_1.4.2 backports_1.1.9
## [28] assertthat_0.2.1 Matrix_1.2-18 cli_2.0.2
## [31] htmltools_0.5.0 tools_4.0.2 gtable_0.3.0
## [34] glue_1.4.2 GenomeInfoDbData_1.2.3 Rcpp_1.0.5
## [37] carData_3.0-4 cellranger_1.1.0 vctrs_0.3.4
## [40] exactRankTests_0.8-31 xfun_0.17 openxlsx_4.1.5
## [43] rvest_0.3.6 lifecycle_0.2.0 rstatix_0.6.0
## [46] XML_3.99-0.5 zlibbioc_1.34.0 zoo_1.8-8
## [49] scales_1.1.1 hms_0.5.3 RColorBrewer_1.1-2
## [52] yaml_2.2.1 curl_4.3 memoise_1.1.0
## [55] gridExtra_2.3 KMsurv_0.1-5 stringi_1.5.3
## [58] RSQLite_2.2.0 genefilter_1.70.0 zip_2.1.1
## [61] BiocParallel_1.22.0 rlang_0.4.7 pkgconfig_2.0.3
## [64] bitops_1.0-6 evaluate_0.14 lattice_0.20-41
## [67] labeling_0.3 bit_4.0.4 tidyselect_1.1.0
## [70] magrittr_1.5 bookdown_0.20 R6_2.4.1
## [73] magick_2.4.0 generics_0.0.2 DBI_1.1.0
## [76] pillar_1.4.6 haven_2.3.1 foreign_0.8-80
## [79] withr_2.2.0 abind_1.4-5 RCurl_1.98-1.2
## [82] modelr_0.1.8 crayon_1.3.4 car_3.0-9
## [85] survMisc_0.5.5 rmarkdown_2.3 locfit_1.5-9.4
## [88] grid_4.0.2 data.table_1.13.0 blob_1.2.1
## [91] reprex_0.3.0 digest_0.6.25 xtable_1.8-4
## [94] munsell_0.5.0